Preprocessing Dataset
Libraries
Library for reading the dataset.
library(readr)      # read the dataset
library(dplyr)      # data frame processing
library(tidyr)
library(rmarkdown)  # R Markdown rendering
library(knitr)
library(scales)     # data presentation (percent formatting)
library(stringr)    # string manipulation
# Load dataset
# Set the path of the data file.
# NOTE: `<-` replaces the original `=` assignment (R style convention).
path <- "../virusTotal/data/virusTotal.csv"
# Load the dataset.
df <- read_csv(path)
# Statistics
# Dimensions of the raw data.
dim(df)
## [1] 183 447
# Types
# Check which types are present in the dataset.
# sapply() over a data frame already simplifies to a named atomic
# vector, so the original trailing unlist() was a no-op and is removed.
col_types_all <-
  df %>%
  sapply(typeof)
# Frequency of each storage type across the columns.
col_types_table <-
  col_types_all %>%
  table()
# Plain named integer vector version of the table, for display.
col_types <-
  col_types_table %>%
  as.vector()
names(col_types) <- names(col_types_table)
## character    double   logical
##       158       204        85
# As can be seen there are the three expected types: character, double
# and logical.
NA
Percentage of NA values
Define a function to report the percentage of NA values in the data frame.
# Share of NA cells in a data frame, formatted as a percentage
# string (e.g. "37%") via scales::percent().
percent_of_NA <-
  function(df) {
    na_cells <- sum(is.na(df))
    total_cells <- prod(dim(df))
    percent(na_cells / total_cells)
  }
percent_of_NA(df)
## [1] "37%"
Columns with NA
Define functions to inspect the NA values in each column.
# Count the NA values in every column of a data frame.
num_of_NA_by_column <-
  function(df) {
    colSums(is.na(df))
  }
# Drop zero entries from a numeric vector.
remove_0 <-
  function(x) x[x != 0]
# Names of the columns that contain at least one NA.
names_of_colums_with_NA <-
  function(df) {
    df %>%
      num_of_NA_by_column() %>%
      remove_0() %>%
      names()
  }
# Share of columns containing NA, as a percentage string.
percentaje_of_cols_with_NA <-
  function(df) {
    (length(names_of_colums_with_NA(df)) / ncol(df)) %>%
      percent()
  }
# Compute the percentage of columns with NA.
percentaje_of_cols_with_NA(df)
## [1] "74%"
Check whether any columns consist entirely of NA values.
# TRUE when every entry of the column is NA.
# all(is.na(col)) is the idiomatic equivalent of the original
# sum(is.na(col)) == length(col) comparison, including for
# zero-length input (both yield TRUE).
is_full_of_NA <- function(col) {
  all(is.na(col))
}
# Names of the columns that are entirely NA.
cols_full_of_NA <-
  df %>%
  select_if(is_full_of_NA) %>%
  names()
## [1] "authentihash" "scans.Bkav.result"
## [3] "scans.CMC.result" "scans.ALYac.result"
## [5] "scans.Malwarebytes.result" "scans.K7AntiVirus.result"
## [7] "scans.Baidu.result" "scans.SUPERAntiSpyware.result"
## [9] "scans.Gridinsoft.result" "scans.ViRobot.result"
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"
## [13] "scans.VBA32.result" "scans.Zoner.result"
## [15] "scans.Panda.result" "scans.Elastic.result"
## [17] "scans.Cylance.result" "scans.SentinelOne.result"
As can be seen, many columns are entirely NA, so they can be deleted.
# Drop the all-NA columns found above.
df <-
  df %>%
  select(-all_of(cols_full_of_NA))
# Columns with the same value
# Some columns may hold a single constant value for every row; such
# columns carry no information and are useless.
# Define a function to remove these columns.
# Number of distinct non-NA values in a vector.
different_values <-
  function(x) {
    length(unique(na.omit(x)))
  }
# Keep only the columns that take more than one distinct value.
remove_columns_with_the_same_value <-
  function(df) {
    select_if(df, function(col) different_values(col) > 1)
  }
# Apply the function.
num_of_cols_after_remove <-
  df %>%
  remove_columns_with_the_same_value() %>%
  ncol()
# Calculate the number of columns holding a single constant value.
ncol(df) - num_of_cols_after_remove
## [1] 147
# Awesome! Many columns found. Let's remove them.
df <-
  remove_columns_with_the_same_value(df)
# Inspecting dataframe
Now let's inspect the data frame more deeply.
Permissions (PERM)
Replace NA with 0 in the permissions columns
The permissions columns (PERM) appear to contain NAs where there should be 0s, so it is better to replace them.
# Replace NA with `replacement` in every column whose name matches
# `pattern`. The replaced columns are re-attached at the end of the
# data frame (cbind of the untouched block with the replaced block),
# so column order changes.
replace_na_which_colname_match <-
  function(df, pattern, replacement) {
    matching_idx <-
      df %>%
      colnames() %>%
      str_which(pattern)
    replaced_part <-
      df %>%
      select(all_of(matching_idx)) %>%
      sapply(function(col) replace_na(col, replacement))
    untouched_part <-
      df %>%
      select(-all_of(matching_idx))
    cbind(untouched_part, replaced_part)
  }
# Replace NA with `replacement` in every column for which the
# predicate `fun` returns TRUE. The replaced columns are re-attached
# at the end of the data frame, so column order changes.
replace_na_when <-
  function(df, fun, replacement) {
    target_names <-
      df %>%
      select_if(fun) %>%
      colnames()
    replaced_part <-
      df %>%
      select(all_of(target_names)) %>%
      sapply(function(col) replace_na(col, replacement))
    untouched_part <-
      df %>%
      select(-all_of(target_names))
    cbind(untouched_part, replaced_part)
  }
# Name prefix shared by all permission columns.
pattern <- "additional_info.androguard.RiskIndicator.PERM"
# Zero-fill the NAs in the permission columns and preview the result.
df <-
  replace_na_which_colname_match(df, pattern, 0)
df %>%
  select(., str_which(colnames(.), pattern)) %>%
  paged_table()
# New total permissions column
# Split off the permission columns, add a row-wise total column
# (total_PERMs), and glue the two pieces back together.
pattern <- "additional_info.androguard.RiskIndicator.PERM"
perm_idx <- str_which(colnames(df), pattern)
df_without_permissions <-
  df %>%
  select(-all_of(perm_idx))
df_permissions <-
  df %>%
  select(all_of(perm_idx)) %>%
  mutate(total_PERMs = rowSums(.))
df <- cbind(df_without_permissions, df_permissions)
# Removing cols
Dates
There are many dates in the dataset that are not relevant for virus analysis, so let's remove them.
Define a function to check whether a column holds Date-like values.
# Logical negation as a pipeable function.
not <-
  function(x) !x
# Pipeable element extraction: x[index].
get_element <-
  function(x, index) x[index]
# Heuristic date detector: take the first non-NA value of the column,
# keep the part before the first space, and try to parse it as
# "YYYY-MM-DD". TRUE when the parse succeeds.
is_date_col <-
  function(col) {
    first_value <-
      col %>%
      as.character() %>%
      na.omit() %>%
      get_element(1)
    date_part <-
      first_value %>%
      str_split(" ") %>%
      unlist() %>%
      get_element(1)
    date_part %>%
      as.Date(format = "%Y-%m-%d") %>%
      is.na() %>%
      not()
  }
# Columns detected.
df %>%
  select_if(is_date_col) %>%
  head() %>%
  paged_table()
# Define a function to remove columns matching a predicate.
# Remove every column of `df` for which the predicate `fun` returns
# TRUE, and return the reduced data frame.
remove_col_if <-
  function(df, fun){
    cols_to_delete <-
      df %>%
      select_if(fun) %>%
      colnames()
    # all_of() makes the external-vector selection explicit; the bare
    # `-cols_to_delete` form triggered tidyselect's "ambiguous external
    # vector" warning and is deprecated.
    df <-
      df %>%
      select(-all_of(cols_to_delete))
    return(df)
  }
# Remove them.
df <-
  remove_col_if(df, is_date_col)
Scans
There are many scans from different antivirus engines that carry the same information.
There are many duplicated columns, hashes, and dates.
# Placeholder for manually curated scan columns to drop (none yet).
cols_to_delete <- c(
)
# Save dataframe
# After all preprocessing, save it back to CSV.
# row.names = FALSE is required: without it write.csv prepends a row
# index column to the very file this script reads, so every rerun
# would gain a spurious extra column.
write.csv(df, path, row.names = FALSE)
# Functions for preprocessing
As factor
# Ordinal level names for a given number of quantile bins.
# Supports n in 2..5; any other value is an error.
labels <-
  function(n){
    if (n == 5) {
      c("very low", "low", "medium", "high", "very high")
    } else if (n == 4) {
      c("very low", "low", "high", "very high")
    } else if (n == 3) {
      c("low", "medium", "high")
    } else if (n == 2) {
      c("low", "high")
    } else {
      # fixed typo in the error message ("avalible" -> "available")
      stop("Not available")
    }
  }
# Discretize a numeric vector into ordinal factor levels at its
# quantile breaks. Vectors with fewer than three distinct quantiles
# (effectively constant) are returned unchanged.
cut_by_quantiles <-
  function(col){
    breaks <- unique(quantile(col, na.rm = TRUE))
    if (length(breaks) > 2) {
      col <- cut(col,
                 breaks = breaks,
                 labels = labels(length(breaks) - 1),
                 include.lowest = TRUE)
    }
    col
  }
# Apply cut_by_quantiles() to every numeric column of `df`.
# Non-numeric columns pass through untouched and end up first, since
# the result is a cbind of the two groups.
df_cut_by_quantiles <-
  function(df){
    numeric_mask <- sapply(df, is.numeric)
    non_numeric_part <- df[!numeric_mask]
    numeric_part <- lapply(df[numeric_mask], cut_by_quantiles)
    cbind(non_numeric_part, numeric_part)
  }